{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initial imports\n",
    "import numpy as np\n",
    "import pandas as pd \n",
    "from pandas import DataFrame, Series\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "%matplotlib inline\n",
    "\n",
    "import random\n",
    "import urllib.request\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20\n",
      "40\n",
      "60\n",
      "80\n",
      "100\n",
      "120\n",
      "140\n",
      "160\n",
      "180\n",
      "200\n",
      "220\n"
     ]
    }
   ],
   "source": [
    "base_url = \"https://sofifa.com/players?offset=\"\n",
    "offset = 0\n",
    "columns = ['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential', 'Club', \n",
    "           'Club Logo', 'Value', 'Wage', 'Special']\n",
    "data = DataFrame(columns=columns)\n",
    "for offset in range(225):\n",
    "    url = base_url + str(offset*80)\n",
    "    source_code = requests.get(url)\n",
    "    plain_text = source_code.text\n",
    "    soup = BeautifulSoup(plain_text)\n",
    "    table_body = soup.find('tbody')\n",
    "    for row in table_body.findAll('tr'):\n",
    "        td = row.findAll('td')\n",
    "        picture = td[0].find('img').get('data-src')\n",
    "        pid = td[0].find('img').get('id')\n",
    "        nationality = td[1].find('a').get('title')\n",
    "        flag_img = td[1].find('img').get('data-src')\n",
    "        name = td[1].findAll('a')[1].text\n",
    "        age = td[2].find('div').text.strip()\n",
    "        overall = td[3].text.strip()\n",
    "        potential = td[4].text.strip()\n",
    "        club = td[5].find('a').text\n",
    "        club_logo = td[5].find('img').get('data-src')\n",
    "        value = td[6].text.strip()\n",
    "        wage = td[7].text.strip()\n",
    "        special = td[8].text.strip()\n",
    "        player_data = DataFrame([[pid, name, age, picture, nationality, flag_img, overall, \n",
    "                                  potential, club, club_logo, value, wage, special]])\n",
    "        player_data.columns = columns\n",
    "        data = data.append(player_data, ignore_index=True)\n",
    "    offset+=1\n",
    "    data.to_csv('full_player_data.csv', encoding='utf-8')\n",
    "    if (offset % 20 == 0):\n",
    "        print(offset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv('Complete/basicplayerdata.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "player_data_url = 'https://sofifa.com/player/'\n",
    "for index, row in data.iterrows():\n",
    "    skill_names = []\n",
    "    skill_map = {'ID' : str(row['ID'])}\n",
    "    url = player_data_url + str(row['ID'])\n",
    "    source_code = requests.get(url)\n",
    "    plain_text = source_code.text\n",
    "    soup = BeautifulSoup(plain_text)\n",
    "    categories = soup.findAll('div', {'class': 'column col-4 mb-20'})\n",
    "    for category in categories[:-1]:\n",
    "        skills = category.findAll('li')\n",
    "        for skill in skills:\n",
    "            a = skill.text.split()\n",
    "            a.reverse()\n",
    "            value = a.pop()\n",
    "            a.reverse()\n",
    "            n = ' '.join(a)\n",
    "            skill_names.append(n)\n",
    "            skill_map[str(n)] = value\n",
    "    master_data = DataFrame(columns=skill_names)\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "master_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "player_data_url = 'https://sofifa.com/player/'\n",
    "r = 0\n",
    "for index, row in data.iterrows():\n",
    "    skill_names = []\n",
    "    skill_map = {'ID' : str(row['ID'])}\n",
    "    url = player_data_url + str(row['ID'])\n",
    "    source_code = requests.get(url)\n",
    "    plain_text = source_code.text\n",
    "    soup = BeautifulSoup(plain_text)\n",
    "    categories = soup.findAll('div', {'class': 'column col-4 mb-20'})\n",
    "    for category in categories[:-1]:\n",
    "        skills = category.findAll('li')\n",
    "        for skill in skills:\n",
    "            a = skill.text.split()\n",
    "            a.reverse()\n",
    "            value = a.pop()\n",
    "            a.reverse()\n",
    "            n = ' '.join(a)\n",
    "            skill_names.append(n)\n",
    "            skill_map[str(n)] = value\n",
    "    attr_data = DataFrame(columns=skill_names)\n",
    "    for key in skill_map.keys():\n",
    "        attr_data.loc[r,key] = skill_map[key]\n",
    "    r = r + 1\n",
    "    master_data = pd.concat([master_data, attr_data])\n",
    "    if r % 1000 == 0:\n",
    "        print(r)\n",
    "        master_data.to_csv('Complete/PlayerAttributeData.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "master_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_data = pd.merge(data, master_data, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "master_data.to_csv('Complete/PlayerAttributeData.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_data.to_csv('Complete/Dataset.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_data.drop('Unnamed: 0', 1,  inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_data.drop('ID_x', 1,  inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_data['ID_y'][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = full_data.rename(index=str, columns={\"ID_y\": \"ID\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['ID'][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f.to_csv('Complete/Dataset.csv', encoding='utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
